---
title: "PwC Case: Auto Insurance Analysis"
date: 2020-06-06
categories:
  - Data Analytics
---
import pandas as pd
import numpy as np
from datetime import timedelta
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import statsmodels.formula.api as smf
# Pandas: show every column when displaying dataframes
pd.options.display.max_columns = None

# Seaborn: white background, larger fonts, wide default figure size
sns.set(style='white', font_scale=1.5, rc={'figure.figsize': (14, 8)})
Pre-define cell styles
# Cell-style helpers for pandas Styler.applymap (PEP 8 E731: prefer `def`
# over lambdas bound to names). Each takes the cell value (ignored) and
# returns a CSS declaration string.
def new_column_style(_):
    """Green-yellow background for newly created columns."""
    return 'background-color: greenyellow'


def highlight_style(_):
    """Gold background for cells of interest."""
    return 'background-color: gold'


def loss_ratio_style(_):
    """Indian-red text for loss-ratio cells (kept for parity; unused below)."""
    return '{ color: indianred }'
The seaborn and plotly packages are used for visualizations, so both must be installed to run the notebook. Installing seaborn is straightforward (`pip install seaborn` or `conda install seaborn`); for plotly, please refer to https://plotly.com/python/getting-started/. Each sheet from the Excel file has been exported to a separate .csv file. While reading the csv files, I parse the datetime-typed columns.
def _read_sheet(sheet_name, **read_kwargs):
    # Each Excel sheet was exported to data/<sheet_name>.csv
    return pd.read_csv(f'data/{sheet_name}.csv', **read_kwargs)


df_agency_dimension = _read_sheet('agency_dimension')
df_claims_fact = _read_sheet('claims_fact')
df_claim_branches = _read_sheet('claim_branches')
df_claim_expense_dimension = _read_sheet('claim_expense_dimension')
df_communication_data = _read_sheet('communication_data')
df_company_dimension = _read_sheet('company_dimension')
df_driver_dimension = _read_sheet('driver_dimension')
df_location_dimension = _read_sheet('location_dimension')
# Policy dates are parsed to datetime64 on read
df_policy_dimension = _read_sheet(
    'policy_dimension',
    parse_dates=['Policy Effective Date', 'Policy Expiration Date', 'Cancel Date'])
df_population_distribution = _read_sheet('population_distribution')
df_risk_fact = _read_sheet('risk_fact')
df_vehicles_dimension = _read_sheet('vehicles_dimension')
I check whether a driver has purchased more than one policy.
# How many distinct drivers appear in the risk fact table?
df_risk_fact['Driver ID'].nunique()
# How many rows are purchases (positive premium)?
df_risk_fact.loc[df_risk_fact['Total Premium'] > 0].shape[0]
Case assumption #9 mentions that some drivers have different Vehicle IDs in risk fact and claim fact tables. Use claims fact table as the source of truth.
# Unique (driver, vehicle) pairs as recorded in the risk fact table
df_vehicle_by_driver_risk_fact = (
    df_risk_fact.loc[:, ['Driver ID', 'Vehicle ID']].drop_duplicates()
)
df_vehicle_by_driver_risk_fact.head(3)
# Unique (driver, vehicle) pairs according to the claims fact table;
# align the driver column name with the risk fact table.
df_vehicle_by_driver_claims_fact = (
    df_claims_fact[['Claimant Id', 'Vehicle ID']]
    .copy()
    .drop_duplicates()
    .rename(columns={'Claimant Id': 'Driver ID'})
)
df_vehicle_by_driver_claims_fact.head(3)
# Side-by-side comparison of the vehicle recorded for each driver
df_vehicle_comparison = (
    df_vehicle_by_driver_risk_fact
    .merge(right=df_vehicle_by_driver_claims_fact, on='Driver ID', how='left')
    .rename(columns={'Vehicle ID_x': 'Vehicle ID Risk Fact',
                     'Vehicle ID_y': 'Vehicle ID Claims Fact'})
)
# Nullable integer so drivers without claims keep <NA> instead of float NaN
df_vehicle_comparison['Vehicle ID Claims Fact'] = \
    df_vehicle_comparison['Vehicle ID Claims Fact'].astype('Int64')
# Keep only drivers that actually appear in the claims fact table
has_claim_vehicle = df_vehicle_comparison['Vehicle ID Claims Fact'].notna()
df_vehicle_comparison = df_vehicle_comparison[has_claim_vehicle]
df_vehicle_comparison.head(3)
# Rows where the two tables disagree on the driver's vehicle
mismatch = (df_vehicle_comparison['Vehicle ID Risk Fact']
            != df_vehicle_comparison['Vehicle ID Claims Fact'])
df_different_vehicles = df_vehicle_comparison[mismatch]
display(
    df_different_vehicles.head()
    .style
    .applymap(highlight_style, subset=['Vehicle ID Risk Fact', 'Vehicle ID Claims Fact'])
)
print(f'{df_different_vehicles.shape[0]} drivers have different vehicle IDs in risk fact and claims fact tables.')
Create a new risk fact dataframe with updated vehicle IDs
# Overwrite Vehicle ID in the risk fact wherever the two tables disagreed
# (claims fact is the source of truth per case assumption #9).
df_risk_fact_vehicle_id_fixed = df_risk_fact.merge(
    right=df_different_vehicles[['Driver ID', 'Vehicle ID Claims Fact']],
    on='Driver ID', how='left')
fix_mask = df_risk_fact_vehicle_id_fixed['Vehicle ID Claims Fact'].notna()
df_risk_fact_vehicle_id_fixed['Vehicle ID'] = np.where(
    fix_mask,
    df_risk_fact_vehicle_id_fixed['Vehicle ID Claims Fact'],
    df_risk_fact_vehicle_id_fixed['Vehicle ID'])
display(
    df_risk_fact_vehicle_id_fixed[fix_mask]
    .head()
    .style
    .applymap(highlight_style, subset=['Vehicle ID', 'Vehicle ID Claims Fact'])
)
# The helper column is no longer needed
df_risk_fact_vehicle_id_fixed.drop(columns=['Vehicle ID Claims Fact'], inplace=True)
Most date of birth values are in YYYY-MM-DD hh:mm:ss format. There are 2162 rows that have an invalid format (whitespace strings).
# Parse birthdays; errors='coerce' turns the invalid (whitespace) strings into NaT
df_driver_dimension['Date of Birth'] = pd.to_datetime(
    df_driver_dimension['Date of Birth'], errors='coerce')
df_driver_dimension.head(3).style.applymap(new_column_style, subset='Date of Birth')
print(f'Column data type of "Date of Birth": {df_driver_dimension["Date of Birth"].dtype}')
print(f'Number of missing (NaT) "Date of Birth": {df_driver_dimension["Date of Birth"].isna().sum()}')
# Peek at rows whose birthday failed to parse
missing_dob = df_driver_dimension['Date of Birth'].isna()
df_driver_dimension[missing_dob].head(3).style.applymap(highlight_style, subset='Date of Birth')
We will have to take care of the missing birthdays later.
What is the duration of policies? Get a count of unique values.
# Policy duration = expiration - effective (a Timedelta per policy)
df_policy_duration = (df_policy_dimension['Policy Expiration Date']
                      - df_policy_dimension['Policy Effective Date'])
df_policy_duration.value_counts()
All policies are either semiannual or annual. Extract this information to a new column.
# Anything longer than ~200 days is an annual policy, otherwise semiannual
df_policy_dimension['Duration'] = np.where(
    df_policy_duration > timedelta(days=200), 'Annual', 'Semiannual')
df_policy_dimension.head(3).style.applymap(new_column_style, subset=['Duration'])
df_policy_dimension['Duration'].value_counts()
There seems to be an even split between semiannual and annual policies.
For the risk fact table, there are 2 types of rows - purchase and refund. A positive premium indicates a purchase of a policy, while a negative premium indicates a refund (cancellation).
Let's merge purchase/refund rows into single rows, creating "Refund Amount" and "Net Premium" columns. The end goal is to have 1 row per driver.
# Refund rows carry a negative premium. There is at most one refund row per
# driver, so no groupby/sum is needed here; if that ever changes, the refund
# amounts would need to be summed per 'Driver ID' first.
refund_rows = df_risk_fact_vehicle_id_fixed['Total Premium'] < 0
df_refund_by_driver = (
    df_risk_fact_vehicle_id_fixed.loc[refund_rows, ['Driver ID', 'Total Premium']]
    .rename(columns={'Total Premium': 'Refund Amount'})
)
df_refund_by_driver.head()
# Attach the refund (if any) to each purchase row (Total Premium > 0)
purchase_rows = df_risk_fact_vehicle_id_fixed['Total Premium'] > 0
df_risk_master = df_risk_fact_vehicle_id_fixed[purchase_rows].merge(
    right=df_refund_by_driver, on='Driver ID', how='left')
# Net Premium = Total Premium + Refund Amount; a missing refund counts as 0
df_risk_master['Net Premium'] = df_risk_master['Total Premium'].add(
    df_risk_master['Refund Amount'], fill_value=0)
df_risk_master.head(3).style.applymap(new_column_style, subset=['Net Premium'])
Merge policy dimension to risk fact
# Bring policy attributes onto each risk row
df_risk_master = (df_risk_master
                  .merge(right=df_policy_dimension, how='inner', on='Policy Number')
                  .sort_values(by=['Policy Number', 'Driver ID']))
df_risk_master.head(3).style.applymap(
    new_column_style,
    subset=df_policy_dimension.columns.to_series().drop('Policy Number'))
Merge driver dimension to risk fact
# Bring driver attributes onto each risk row
df_risk_master = df_risk_master.merge(right=df_driver_dimension, on='Driver ID')
df_risk_master.head(3).style.applymap(
    new_column_style,
    subset=df_driver_dimension.columns.to_series().drop('Driver ID'))
Add age of driver based on date of birth and policy effective date
# Age of the driver at the policy effective date: start from
# (year difference - 1), then add 1 if the birthday has already occurred
# by the effective date that year.
# BUG FIX: the original compared with a strict '>', which under-counted the
# age of drivers whose birthday falls exactly ON the effective date; '>='
# counts them as having reached the new age.
df_risk_master['Age'] = (df_risk_master['Policy Effective Date'].dt.year
                         - df_risk_master['Date of Birth'].dt.year) - 1
df_risk_master['Age'] = df_risk_master['Age'] + (
    df_risk_master['Policy Effective Date'].dt.strftime('%m%d')
    >= df_risk_master['Date of Birth'].dt.strftime('%m%d'))
# Nullable integer: drivers with a missing birthday keep <NA>
df_risk_master['Age'] = df_risk_master['Age'].astype('Int64')
# Check results
df_risk_master[['Policy Effective Date', 'Date of Birth', 'Age']].head(3) \
    .style.applymap(new_column_style, subset=['Age'])
Create age bins
# Bin ages by decade. Half-unit edges with right=False put integer ages
# 10..19 into '10-19', 20..29 into '20-29', and so on.
# FIX: np.Infinity was removed in NumPy 2.0 — use np.inf (same value).
df_risk_master['Age Bin'] = pd.cut(
    df_risk_master['Age'],
    bins=[9.5, 19.5, 29.5, 39.5, 49.5, 59.5, 69.5, np.inf],
    labels=['10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70+'],
    include_lowest=True,
    right=False,
)
df_risk_master[['Driver ID', 'Policy Effective Date', 'Date of Birth', 'Age', 'Age Bin']].head(3) \
    .style \
    .applymap(new_column_style, subset='Age Bin')
Calculate monthly premium based on semiannual/annual classification we've done in policy dimension
# Normalize premiums to a monthly figure so semiannual and annual
# policies become comparable.
is_annual = df_risk_master['Duration'] == 'Annual'
df_risk_master['Monthly Premium'] = np.where(
    is_annual,
    df_risk_master['Total Premium'] / 12,
    df_risk_master['Total Premium'] / 6)
df_risk_master[['Policy Number', 'Driver ID', 'Total Premium', 'Duration', 'Monthly Premium']] \
    .head(3) \
    .style \
    .applymap(new_column_style, subset=['Monthly Premium'])
Merge Claim Fact dataframe with Claim Expense Dimension.
# Attach the expense rows to their claims
df_claims_fact_merged = df_claims_fact.merge(
    right=df_claim_expense_dimension, on='Claim ID', how='left')
df_claims_fact_merged.head(3).style.applymap(
    new_column_style,
    subset=df_claim_expense_dimension.columns.to_series().drop('Claim ID'))
For each driver, calculate (1) sum of all claim amounts, (2) sum of all claim expenses, (3) number of claims. This step is to aggregate different claims by a single driver into one row.
# Aggregate to one row per driver: claim count, total claim amount, total
# expense. The count of 'Claim Description' doubles as the claim counter.
df_claims_by_driver = (
    df_claims_fact_merged
    .groupby(by='Claimant Id', as_index=False)
    .agg({'Claim Description': 'count',
          'Claim Amount': 'sum',
          'Expense Amount': 'sum'})
    .rename(columns={'Claim Description': 'Claim Count',
                     'Claimant Id': 'Driver ID',
                     'Expense Amount': 'Claim Expense'})
)
df_claims_by_driver.head(3)
# Attach the per-driver claim aggregates to the risk master table
df_risk_master = df_risk_master.merge(right=df_claims_by_driver, on='Driver ID', how='left')
# Nullable integer: drivers without claims keep <NA> rather than float NaN
df_risk_master['Claim Count'] = df_risk_master['Claim Count'].astype('Int64')
df_risk_master.head(3).style.applymap(
    new_column_style,
    subset=df_claims_by_driver.columns.to_series().drop('Driver ID'))
Merge vehicles dimension into risk fact
df_vehicles_dimension.head(3)
# Attach vehicle attributes, shortening the column names while merging
df_risk_master = df_risk_master.merge(
    right=df_vehicles_dimension, on='Vehicle ID', how='left'
).rename(columns={'Car Make': 'Make', 'Average Price': 'Vehicle Average Price'})
df_risk_master[['Policy Number', 'Driver ID', 'Make', 'Model', 'Vehicle Average Price']] \
    .head(3) \
    .style \
    .applymap(new_column_style, subset=['Make', 'Model', 'Vehicle Average Price'])
Add a column to indicate whether a policy was cancelled
# A policy counts as cancelled when it carries any cancel date
df_risk_master['Cancelled'] = df_risk_master['Cancel Date'].notna()
df_risk_master[['Policy Number', 'Driver ID', 'Cancel Date', 'Cancelled']] \
    .head(3) \
    .style \
    .applymap(new_column_style, subset=['Cancelled'])
Merge location dimension into risk fact
# Inspect the key dtypes before merging on 'Location ID'
print('Before converting column data types')
print(f"df_risk_master['Location ID'].dtype: {df_risk_master['Location ID'].dtype}")
print(f"df_location_dimension['Location ID'].dtype: {df_location_dimension['Location ID'].dtype}")
However, the dtypes of df_risk_master and df_location_dimension are different as shown above. This will result in an error when merging the two dataframes by Location ID. Cast both to nullable integer types (Int64) first.
# BUG FIX: the original converted df_risk_fact['Location ID'] — the raw,
# unfiltered table — and assigned it into df_risk_master. Because pandas
# aligns that assignment by index label, and df_risk_master has been
# filtered/merged/re-sorted since, the location IDs were silently
# misassigned. Convert df_risk_master's own column instead.
df_risk_master['Location ID'] = pd.to_numeric(
    df_risk_master['Location ID'], errors='coerce').astype('Int64')
df_location_dimension['Location ID'] = df_location_dimension['Location ID'].astype('Int64')
print('After converting column data types')
print(f"df_risk_master['Location ID'].dtype: {df_risk_master['Location ID'].dtype}")
print(f"df_location_dimension['Location ID'].dtype: {df_location_dimension['Location ID'].dtype}")
Now, we should be able to merge.
# Attach location attributes (ZIP, city, state, county, population)
df_risk_master = df_risk_master.merge(df_location_dimension, on='Location ID', how='left')
df_risk_master \
    .head(3) \
    .style \
    .applymap(new_column_style, subset=['ZIP', 'Primary_City', 'State', 'County', 'Estimated_population_2013'])
Keep in mind that there is one row where the Location ID is missing.
# Show the one row whose Location ID is missing
df_risk_master[df_risk_master['Location ID'].isna()]
# BUG FIX: the original had a stray line-continuation backslash after the
# in-place rename(), which glued the following statement onto it and made
# the cell invalid Python.
df_risk_master.rename(columns={
    'Primary_City': 'City',
    'Estimated_population_2013': 'Estimated Population'
}, inplace=True)
df_risk_master.head(3)
Create a master claims fact dataframe.
# Claims master: claims + expenses + handling-branch attributes, with
# column names normalized to match the risk master table.
df_claims_master = (
    df_claims_fact_merged
    .merge(right=df_claim_branches,
           left_on='Claim Handling Branch',
           right_on='Branch Id',
           how='left')
    .drop(columns=['Claim Handling Branch'])
    .rename(columns={'Expense Amount': 'Claim Expense',
                     'Branch Id': 'Branch ID',
                     'Claimant Id': 'Driver ID'})
)
df_claims_master.head(3)
Remember, the data was artificially created for this project. This section explores any odd parts that seem unrealistic.
Check for drivers where the sum of premium grouped by that driver ID is negative.
# Drivers whose summed net premium is negative (refund exceeded payment)
premiums = df_risk_master.groupby('Driver ID')['Net Premium'].sum().reset_index()
negative_premiums = premiums[premiums['Net Premium'] < 0]
display(negative_premiums.head().style.applymap(highlight_style, subset=['Net Premium']))
print(f'{negative_premiums.shape[0]} drivers have negative sum of premiums.')
There are policies that have refund amounts larger than the payment amount.
# Split the rows by the sign of the net premium
df_risk_fact_positive_premiums = df_risk_master[df_risk_master['Net Premium'] > 0]
df_risk_fact_negative_premiums = df_risk_master[df_risk_master['Net Premium'] < 0]
# Inspect the negative-net-premium rows, highlighting their duration
df_risk_fact_negative_premiums[['Policy Number', 'Driver ID', 'Total Premium', 'Refund Amount', 'Net Premium', 'Duration']] \
    .head() \
    .style \
    .applymap(highlight_style, subset=['Duration'])
These negative net premium rows seem to have semiannual duration. Let's look at the exact proportions.
# Two donut charts: duration split among positive vs negative net premiums
fig = make_subplots(
    1, 2,
    specs=[[{'type': 'domain'}, {'type': 'domain'}]],
    subplot_titles=['Positive Net Premiums', 'Negative Net Premiums'])
pie_labels = ['Annual', 'Semiannual']
duration_counts = [
    df_risk_fact_positive_premiums['Duration'].value_counts().sort_index(),
    df_risk_fact_negative_premiums['Duration'].value_counts().sort_index(),
]
for col, counts in enumerate(duration_counts, start=1):
    fig.add_trace(
        go.Pie(
            labels=pie_labels,
            values=counts,
            text=pie_labels,
            textposition='inside',
            insidetextorientation='horizontal',
            pull=[0.0, 0, 0.1, 0],
            sort=False,
            hole=0.3,
        ), 1, col
    )
fig.update_layout(title_text='Durations by Positive/Negative Net Premiums')
fig.show()
# Refunds issued even though the policy was never cancelled.
df_refund_without_cancellation = df_risk_master[
    (df_risk_master['Refund Amount'].notna()) & (df_risk_master['Cancel Date'].isna())]
# FIX: take an explicit .copy() so the 'Refund Percentage' assignment below
# writes to an independent frame instead of a view of df_risk_master
# (avoids pandas' SettingWithCopyWarning).
df_refund_without_cancellation = df_refund_without_cancellation[
    ['Policy Number',
     'Driver ID',
     'Total Premium',
     'Refund Amount',
     'Policy Effective Date',
     'Policy Expiration Date',
     'Cancelled',
     'Cancel Date',
     'Duration']].copy()
df_refund_without_cancellation \
    .head() \
    .style \
    .applymap(highlight_style, subset=['Refund Amount', 'Cancelled'])
# Share of the original premium refunded (refund amounts are negative)
df_refund_without_cancellation['Refund Percentage'] = (
    -df_refund_without_cancellation['Refund Amount']
    / df_refund_without_cancellation['Total Premium'])
df_refund_without_cancellation \
    .head() \
    .style \
    .applymap(highlight_style, subset='Refund Percentage')
print(f'There are {df_refund_without_cancellation.shape[0]} refunds issued without a cancellation.')
A driver must be at least 15.5 years old to apply for a learner's permit in California. That means that anyone under 15 can't purchase an auto insurance policy.
# Drivers younger than the legal minimum for a CA learner's permit
df_driver_underage = df_risk_master[df_risk_master['Age'] < 15]
df_driver_underage = df_driver_underage[
    ['Policy Number', 'Driver ID', 'Age', 'Policy Effective Date',
     'Date of Birth', 'Marital Status', 'Primary Vehicle Use', 'Gender']]
df_driver_underage.head() \
    .style \
    .applymap(highlight_style, subset=['Age'])
print(f'There are {df_driver_underage.shape[0]} drivers less than 15 years old.')
print(f"{df_driver_underage[df_driver_underage['Marital Status'] == 'M'].shape[0]} of them are married according to the data.")
# 2x2 grid of count plots profiling the under-15 drivers
fig, axs = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Drivers Under 15')
fig.tight_layout(pad=3.0)
# axs.flatten() yields axes row by row, matching the feature order below
underage_features = ['Age', 'Marital Status', 'Gender', 'Primary Vehicle Use']
for feature, ax in zip(underage_features, axs.flatten()):
    sns.countplot(x=feature, data=df_driver_underage, color='#444444', ax=ax)
Usually, vehicles for year N is released a year before (N - 1). For example, Corolla 2021 will be released in 2020. So, it is okay in some cases to have vehicle model year being larger than the policy effective year. However, it doesn't make sense if the vehicle model year is larger than the policy effective year by 2 or larger.
# Vehicles whose model year is implausibly far (>= 2 years) ahead of the
# policy effective year.
# BUG FIX: the original had a stray line-continuation backslash after the
# column selection, gluing the next assignment onto the same statement
# (a SyntaxError). Also take .copy() so the column assignment below does
# not trigger SettingWithCopyWarning on a chained-indexing slice.
df_invalid_vehicle_model_years = df_risk_master[
    df_risk_master['Vehicle Model Year'] >= df_risk_master['Policy Effective Date'].dt.year + 2
][['Policy Number', 'Vehicle ID', 'Vehicle Model Year', 'Policy Effective Date']].copy()
df_invalid_vehicle_model_years['Policy Effective Year'] = \
    df_invalid_vehicle_model_years['Policy Effective Date'].dt.year
display(
    df_invalid_vehicle_model_years.head()
    .style
    .applymap(highlight_style, subset=['Vehicle Model Year', 'Policy Effective Year'])
)
print(f'There are {df_invalid_vehicle_model_years.shape[0]} rows where the vehicle model year is larger than the policy effective year by at least two.')
Before starting to explore the factors that affect the premiums, let's first determine whether using the total premium is appropriate. To do that, I create box plots by number of violations on total premium and monthly premium.
# Compare the spread of total vs monthly premium across violation counts
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
fig.suptitle('Total vs Monthly premium distributions by # of violations')
df_positive_net = df_risk_master[df_risk_master['Net Premium'] > 0]
sns.boxplot(x='Number of Violations', y='Total Premium',
            data=df_positive_net, palette='Blues', ax=ax1)
sns.boxplot(x='Number of Violations', y='Monthly Premium',
            data=df_positive_net, palette='Blues', ax=ax2)
The left plot does show a slight sign of positive linear relationship between the number of violations and total premium. However, the distributions on the left plot are skewed towards the right top since the total premium columns contain both semiannual/annual policies.
# 4x2 grid: monthly premium distribution by each candidate rating factor
fig, axs = plt.subplots(4, 2, figsize=(16, 24))
fig.tight_layout(pad=3.0)


def display_premium_distribution(data, by, ax):
    """Box plot of 'Monthly Premium' grouped by column `by`, drawn on `ax`."""
    return sns.boxplot(data=data, x=by, y='Monthly Premium', palette='Blues', ax=ax)


# axs.flatten() yields axes row by row, matching the factor order below
premium_factors = ['Gender', 'Primary Vehicle Use',
                   'Marital Status', 'Duration',
                   'Number of Violations', 'Number of Accidents',
                   'Vehicle Model Year', 'Age Bin']
for factor, ax in zip(premium_factors, axs.flatten()):
    display_premium_distribution(data=df_risk_master, by=factor, ax=ax)
The box plots above show how the monthly premium distribution varies with each candidate rating factor. Next, I fit a linear regression to quantify these relationships.
Replace spaces in column names with underscores
# statsmodels formulas cannot contain spaces, so snake_case the column names
df_risk_renamed = df_risk_master.copy().rename(
    columns={col: col.strip().replace(' ', '_') for col in df_risk_master.columns})
df_risk_renamed.head(3)
# OLS of monthly premium on the rating factors; C(...) marks categoricals.
# Adjacent string literals concatenate implicitly.
linreg_formula = (
    'Monthly_Premium ~ Number_of_Violations + '
    'Number_of_Accidents + Vehicle_Model_Year + '
    'Duration + '
    'C(Gender) + C(Age_Bin) + C(Marital_Status) + C(Primary_Vehicle_Use) + '
    'C(Make) + C(Model)'
)
results = smf.ols(linreg_formula, data=df_risk_renamed).fit()
results.summary()
Here, I look at the loss ratios grouped by each dependent variable.
# Combined ratio includes claim handling expenses; loss ratio does not.
# BUG FIX: the two print messages had "with"/"without claim expenses"
# swapped relative to the quantities actually computed (the variable
# names were already correct).
overall_combined_ratio_with_expenses = (
    df_risk_master['Claim Amount'].sum() + df_risk_master['Claim Expense'].sum()
) / df_risk_master['Net Premium'].sum()
print(f'Overall combined ratio with claim expenses is {overall_combined_ratio_with_expenses * 100:.4f}%.')
overall_loss_ratio_without_expenses = df_risk_master['Claim Amount'].sum() / df_risk_master['Net Premium'].sum()
print(f'Overall loss ratio without claim expenses is {overall_loss_ratio_without_expenses * 100:.4f}%.')
A few functions are defined here to calculate and visualize performance metrics.
def get_performance_metrics(df_risk_merged, by):
    '''
    Calculate loss/combined ratios and related metrics grouped by column(s).

    Parameters:
        df_risk_merged (DataFrame): Merged/cleaned risk master table, one row
            per driver, containing at least 'Net Premium', 'Claim Amount',
            'Claim Count', 'Claim Expense', 'Monthly Premium' and 'Record ID'.
        by (str or List[str]): Column name(s) to group by. The argument is
            never mutated.

    Returns:
        DataFrame: One row per group with premium/claim totals, per-driver
        averages, Loss Ratio and Combined Ratio, sorted by Combined Ratio
        in descending order.

    BUG FIX: the original did `column_order = by if type(by) == list else [by]`
    and then called .extend() on it, appending all the metric column names to
    the *caller's* list whenever a list was passed. We copy the list instead,
    and use isinstance() for the type check.
    '''
    df_g = df_risk_merged.groupby(by=by).agg({
        'Net Premium': 'sum',
        'Claim Amount': 'sum',
        'Claim Count': 'sum',
        'Claim Expense': 'sum',
        'Monthly Premium': 'sum',
        'Record ID': 'size'  # 'size' counts rows, i.e. drivers, per group
    }).reset_index()
    df_g.rename(columns={
        'Record ID': 'Number of Drivers',
        'Claim Amount': 'Total Claim Amount',
        'Claim Expense': 'Total Claim Expense',
        'Net Premium': 'Total Premium'
    }, inplace=True)
    # Loss ratio excludes claim handling expenses; combined ratio includes them
    df_g['Loss Ratio'] = df_g['Total Claim Amount'] / df_g['Total Premium']
    df_g['Combined Ratio'] = (df_g['Total Claim Amount'] + df_g['Total Claim Expense']) / df_g['Total Premium']
    # Sort result by combined ratio in descending order
    df_g.sort_values('Combined Ratio', ascending=False, inplace=True)
    # Per-group shares and per-driver / per-claim averages
    df_g['Driver %'] = df_g['Number of Drivers'] / df_risk_merged.shape[0]
    df_g['Average Monthly Premium'] = df_g['Monthly Premium'] / df_g['Number of Drivers']
    df_g['Claim %'] = df_g['Claim Count'] / df_g['Number of Drivers']
    df_g['Average Claim Amount'] = df_g['Total Claim Amount'] / df_g['Claim Count']
    df_g['Average Claim Expense'] = df_g['Total Claim Expense'] / df_g['Claim Count']
    # Group-by columns first, then metrics, for readability.
    # list(by) copies so we never mutate the caller's list.
    column_order = list(by) if isinstance(by, list) else [by]
    column_order.extend(['Number of Drivers',
                         'Driver %',
                         'Total Premium',
                         'Monthly Premium',
                         'Claim Count',
                         'Claim %',
                         'Total Claim Amount',
                         'Total Claim Expense',
                         'Average Monthly Premium',
                         'Average Claim Amount',
                         'Average Claim Expense',
                         'Loss Ratio',
                         'Combined Ratio'])
    return df_g[column_order]
def display_performance_metrics(df):
    '''
    Display performance metrics with background bar plots.

    Parameters:
        df (DataFrame): A dataframe containing the performance metrics
            (as produced by get_performance_metrics)

    Returns:
        Styler: A styled view of the dataframe with bar plots drawn behind
        the Loss Ratio / Combined Ratio cells and currency/percent formats
        applied to the metric columns
    '''
    # Create a cloned instance of the DataFrame to avoid side-effects
    df = df.copy()
    # Columns that are hidden (not displayed)
    hidden_columns = ['Monthly Premium']
    hidden_columns_matched = [col for col in hidden_columns if col in df.columns]
    # Drop hidden columns
    df = df.drop(columns=hidden_columns_matched)
    # Metric columns display order
    metric_columns_order = ['Number of Drivers',
                            'Driver %',
                            'Total Premium',
                            'Average Monthly Premium',
                            'Claim Count',
                            'Claim %',
                            'Total Claim Amount',
                            'Total Claim Expense',
                            'Average Claim Amount',
                            'Average Claim Expense',
                            'Loss Ratio',
                            'Combined Ratio']
    # Only include the metric columns that exist
    metric_columns_order = [col for col in metric_columns_order if col in df.columns]
    # Non-metric columns (group-by keys and merged-in labels) display order
    non_metric_columns_order = df.columns.to_series().copy() \
        .drop(metric_columns_order)
    # Display non-metric columns first, followed by metric columns
    display_column_order = non_metric_columns_order.append(pd.Series(metric_columns_order))
    df = df[display_column_order]
    # Percent/currency formats; in-cell background bars for the two ratios,
    # capped at min(1, 1.2 * column max) so moderate ratios stay readable
    return df.copy() \
        .style \
        .format({
            'Driver %': '{:.1%}',
            'Total Premium': '${:,.0f}',
            'Average Monthly Premium': '${:,.0f}',
            'Claim %': '{:.1%}',
            'Total Claim Amount': '${:,.0f}',
            'Total Claim Expense': '${:,.0f}',
            'Average Claim Amount': '${:,.0f}',
            'Average Claim Expense': '${:,.0f}',
            'Loss Ratio': '{:.1%}',
            'Combined Ratio': '{:.1%}',
        }) \
        .bar(subset=['Loss Ratio'],
             color='#FFEBEB',
             vmin=0,
             vmax=min(1, df['Loss Ratio'].max() * 1.2)) \
        .bar(subset=['Combined Ratio'],
             color='#FFDFDF',
             vmin=0,
             vmax=min(1, df['Combined Ratio'].max() * 1.2)) \
        .applymap(lambda x: 'color: #BB0000; background-color: #fff; border-bottom: 1px solid white;',
                  subset=['Loss Ratio']) \
        .applymap(lambda x: 'color: #D42323; background-color: #fff; font-weight: bold; border-bottom: 1px solid #fff;',
                  subset=['Combined Ratio'])
# Performance metrics by marital status
df_loss_ratio_by_marital_status = get_performance_metrics(df_risk_master, 'Marital Status')
display_performance_metrics(df_loss_ratio_by_marital_status)
# Performance metrics by gender
df_loss_ratio_by_gender = get_performance_metrics(df_risk_master, 'Gender')
display_performance_metrics(df_loss_ratio_by_gender)
# Performance metrics by primary vehicle use
df_loss_ratio_by_vehicle_use = get_performance_metrics(df_risk_master, 'Primary Vehicle Use')
display_performance_metrics(df_loss_ratio_by_vehicle_use)
This is an example to calculate the loss ratios grouped by multiple columns.
# Metrics grouped by several columns at once
df_loss_ratio_by_marital_gender_use = get_performance_metrics(df_risk_master, ['Marital Status', 'Gender', 'Primary Vehicle Use'])
display_performance_metrics(df_loss_ratio_by_marital_gender_use)
# Metrics by policy duration (semiannual vs annual)
df_loss_ratio_by_duration = get_performance_metrics(df_risk_master, 'Duration')
display_performance_metrics(df_loss_ratio_by_duration)
# Metrics by underwriting company, with company name/city attached
df_loss_ratio_by_underwriter = get_performance_metrics(df_risk_master, 'Underwriting Company ID')
df_loss_ratio_by_underwriter = df_loss_ratio_by_underwriter \
    .merge(right=df_company_dimension[['Underwriting Company ID', 'Underwriting Agency Name', 'Agency City']], on='Underwriting Company ID')
display_performance_metrics(df_loss_ratio_by_underwriter)
# Metrics by agency, with the agency name attached
df_loss_ratio_by_agency = get_performance_metrics(df_risk_master, 'Agency ID')
df_loss_ratio_by_agency = df_loss_ratio_by_agency.merge(right=df_agency_dimension[['Agency ID', 'Agency Name']], on='Agency ID')
display_performance_metrics(df_loss_ratio_by_agency)
# Metrics by driver age bin
df_loss_ratio_by_age_bin = get_performance_metrics(df_risk_master, 'Age Bin')
display_performance_metrics(df_loss_ratio_by_age_bin)
# Metrics by vehicle make
df_loss_ratio_by_vehicle_make = get_performance_metrics(df_risk_master, 'Make')
display_performance_metrics(df_loss_ratio_by_vehicle_make)
# Metrics by specific vehicle; add a combined "Make Model" label
df_loss_ratio_by_vehicle_id = get_performance_metrics(df_risk_master, 'Vehicle ID')
df_loss_ratio_by_vehicle_id = df_loss_ratio_by_vehicle_id.merge(df_vehicles_dimension[['Vehicle ID', 'Car Make', 'Model']], on='Vehicle ID', how='left')
df_loss_ratio_by_vehicle_id['Make Model'] = df_loss_ratio_by_vehicle_id['Car Make'] + ' ' + df_loss_ratio_by_vehicle_id['Model']
# Drop the columns now folded into "Make Model"
df_loss_ratio_by_vehicle_id.drop(columns=['Car Make', 'Model'], inplace=True)
display_performance_metrics(df_loss_ratio_by_vehicle_id)
# Metrics by county
df_loss_ratio_by_county = get_performance_metrics(df_risk_master, 'County')
display_performance_metrics(df_loss_ratio_by_county)
Only display top 20 cities with the largest number of policyholders
# Metrics by city — show only the 20 cities with the most policyholders
df_loss_ratio_by_city = get_performance_metrics(df_risk_master, 'City')
display_performance_metrics(df_loss_ratio_by_city.nlargest(20, 'Number of Drivers').sort_values('Combined Ratio', ascending=False))
# Full table (all cities), kept for reference:
# display_performance_metrics(df_loss_ratio_by_city)
Only display top 20 ZIP codes with the largest number of policyholders
# Metrics by ZIP — show only the 20 ZIP codes with the most policyholders
df_loss_ratio_by_ZIP = get_performance_metrics(df_risk_master, 'ZIP')
display_performance_metrics(df_loss_ratio_by_ZIP.nlargest(20, 'Number of Drivers').sort_values('Combined Ratio', ascending=False))
# Full table (all ZIP codes), kept for reference:
# display_performance_metrics(df_loss_ratio_by_ZIP)
Looking at the loss ratios by vehicle age may be a better analysis. However, I've included this since a few groups are doing analysis on the loss ratios by vehicle model year.
# Metrics by vehicle model year, displayed in chronological order
df_loss_ratio_by_model_year = get_performance_metrics(df_risk_master, 'Vehicle Model Year')
display_performance_metrics(df_loss_ratio_by_model_year.sort_values('Vehicle Model Year'))
# Per-branch claim aggregates
df_claims_by_branch = df_claims_master.groupby(['Branch ID', 'Branch Name']).agg({
    'Claim Amount': 'sum',
    'Claim ID': 'count',
    'Claim Expense': 'sum'
}).reset_index().rename(columns={'Claim ID': 'Claim Count'})
# Shorten branch names by stripping the ' CA Branch' suffix.
# FIX: pass regex=False — the pattern is a literal string, and relying on
# the old regex=True default of Series.str.replace raises a FutureWarning
# on modern pandas (and would misbehave if the literal ever contained
# regex metacharacters).
df_claims_by_branch['Branch'] = df_claims_by_branch['Branch Name'].str.replace(' CA Branch', '', regex=False)
df_claims_by_branch.drop(columns=['Branch Name'], inplace=True)
# Per-claim averages and the expense-to-claim-amount ratio
df_claims_by_branch['Average Claim Amount'] = df_claims_by_branch['Claim Amount'] / df_claims_by_branch['Claim Count']
df_claims_by_branch['Average Claim Expense'] = df_claims_by_branch['Claim Expense'] / df_claims_by_branch['Claim Count']
df_claims_by_branch['Claim Expense Ratio'] = df_claims_by_branch['Claim Expense'] / df_claims_by_branch['Claim Amount']
df_claims_by_branch
# Donut charts: each branch's share of claim count and of claim dollars
fig = make_subplots(
    1, 2,
    specs=[[{'type': 'domain'}, {'type': 'domain'}]],
    subplot_titles=['By Number of Claims', 'By Total Claim Amount'])
for col, value_column in enumerate(['Claim Count', 'Claim Amount'], start=1):
    fig.add_trace(
        go.Pie(
            labels=df_claims_by_branch['Branch'],
            values=df_claims_by_branch[value_column],
            text=df_claims_by_branch['Branch'],
            textposition='inside',
            insidetextorientation='horizontal',
            pull=[0.0, 0, 0.05, 0],
            sort=False,
            hole=0.3,
        ), 1, col
    )
fig.update_layout(title_text='Claims Breakdown by Count & Amount')
fig.show()
Claim expense ratio is defined as Claim Expense / Claim Amount.
# Bar chart of claim expense ratio per branch, labelled as percentages
ratio_labels = (df_claims_by_branch['Claim Expense Ratio'] * 100).round(2).astype(str) + '%'
fig = px.bar(
    df_claims_by_branch,
    x='Branch',
    y='Claim Expense Ratio',
    title='Claim Expense Ratio by Claim Branch',
    text=ratio_labels,
    color='Branch',
    labels={'Branch': 'Claim Branch'},
    range_y=[0, 0.2],
)
fig.update_traces(textposition='outside')
fig.update_layout(
    yaxis=dict(
        gridcolor='#f5f5f5',
        gridwidth=1,
        zerolinecolor='#f5f5f5',
        zerolinewidth=1,
    ),
    yaxis_tickformat='%',
    paper_bgcolor='white',
    plot_bgcolor='white',
    autosize=False,
    width=700,
)
fig.show()
# Claim aggregates by claim type (description)
df_claims_by_description = df_claims_master.groupby('Claim Description').agg({
    'Claim ID': 'count',
    'Claim Amount': 'sum',
    'Claim Expense': 'sum'
}) \
    .rename(columns={'Claim ID': 'Count'}) \
    .reset_index()
# Re-label the claim type at row index 1 (presumably shortening a long
# description for plotting — TODO confirm against the data).
# FIX: corrected the misspelled replacement label 'Damge' -> 'Damage'.
df_claims_by_description.loc[1, 'Claim Description'] = 'Damage from impacts with animals'
# Per-claim averages and the expense-to-claim-amount ratio
df_claims_by_description['Average Claim Amount'] = df_claims_by_description['Claim Amount'] / df_claims_by_description['Count']
df_claims_by_description['Average Claim Expense'] = df_claims_by_description['Claim Expense'] / df_claims_by_description['Count']
df_claims_by_description['Claim Expense Ratio'] = df_claims_by_description['Claim Expense'] / df_claims_by_description['Claim Amount']
df_claims_by_description
def _claim_type_pie(values_column, title):
    """Donut chart of claim types weighted by the given column."""
    pie = px.pie(
        df_claims_by_description,
        values=values_column,
        names='Claim Description',
        title=title,
        hole=0.4,
        color_discrete_sequence=px.colors.sequential.Blues
    )
    pie.update_traces(textposition='outside', textinfo='percent+label')
    pie.update_layout(showlegend=False)
    return pie


# Claim types by count
fig_types_by_count = _claim_type_pie('Count', 'Claim Types by Count')
fig_types_by_count.show()
# Claim types by amount
fig_types_by_amount = _claim_type_pie('Claim Amount', 'Claim Types by Amount')
fig_types_by_amount.show()
I define a "sale" as a new driver signing up with one of the agencies. The driver can either start a new policy or join an existing policy (i.e. join a policy of another family member).
# A "sale" = a new driver signing up. Keep the effective date and agency,
# plus a Year-Month string key for monthly grouping below.
df_policy_activation = df_risk_master[['Policy Effective Date', 'Agency ID']].copy()
df_policy_activation['YearMonth'] = df_policy_activation['Policy Effective Date'].dt.strftime('%Y-%m')
df_policy_activation.head(3)
First, count the number of new policies
# Count new-driver activations per Year-Month.
# FIX: the original also renamed 'Policy Effective Date' -> 'Date', but that
# column does not exist after grouping by 'YearMonth', so the mapping was
# silently-ignored dead code. `size().reset_index(name=...)` replaces the
# to_frame()/rename-of-column-0 dance with one call and the same result.
df_num_policies_activated_by_date = df_policy_activation \
    .groupby('YearMonth') \
    .size() \
    .reset_index(name='Activation')
df_num_policies_activated_by_date.head(3)
# Monthly time series of new-driver activations
fig = px.line(
    df_num_policies_activated_by_date,
    x='YearMonth',
    y='Activation',
    labels={'YearMonth': 'Month'},
    title='Number of New Drivers Who Activated a Policy by Year-Month',
)
fig.show()
# Persist the cleaned master tables for reuse in other notebooks.
# FIX: a markdown heading ("pickle files") was fused onto the first code
# line in the flattened original, which is invalid Python.
df_risk_master.to_pickle('df_risk_master.pkl')
df_claims_master.to_pickle('df_claims_master.pkl')